In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from load_utils import *
from analysis_utils import compare_groups, get_genders
In [2]:
d = load_diffs()
df_events, df_blocked_user_text = load_block_events_and_users()
In [3]:
d['blocked'].groupby('user_text')['pred_aggression_score']\
    .agg(aggressiveness='mean')\
    .hist(bins=50)
plt.xlabel('user level aggression score')
plt.ylabel('num users')
plt.title('')  # clear the default 'aggressiveness' title set by DataFrame.hist
Out[3]:
[histogram: per-user mean aggression score vs. num users, blocked users]
In [4]:
# let's exclude anonymous (IP) authors
d['blocked'].query('not author_anon').groupby('user_text')['pred_aggression_score']\
    .agg(aggressiveness='mean')\
    .hist(bins=50)
plt.xlabel('user level aggression score')
plt.ylabel('num users')
plt.title('')
Out[4]:
[histogram: per-user mean aggression score vs. num users, blocked registered users]
In [5]:
# let's compare to non-blocked users
# NOTE: it would be better to take a random sample of users (see the sketch after this cell)
d['2015'].query('not author_anon').groupby('user_text')['pred_aggression_score']\
    .agg(aggressiveness='mean')\
    .hist(bins=50)
plt.xlabel('user level aggression score')
plt.ylabel('num users')
plt.title('')
Out[5]:
[histogram: per-user mean aggression score vs. num users, registered users in the 2015 sample]
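As noted above, drawing a random sample of registered users before computing per-user means would make this panel more comparable. A minimal sketch of one way to do that; the sample size of 10,000 and the seed are arbitrary choices, not part of the original analysis:
In [ ]:
# Hypothetical: sample a fixed number of registered users uniformly,
# then restrict to their comments before computing per-user means.
rng = np.random.default_rng(0)
users = d['2015'].query('not author_anon')['user_text'].unique()
sampled = rng.choice(users, size=min(10000, len(users)), replace=False)
d_user_sample = d['2015'][d['2015']['user_text'].isin(set(sampled))]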
In [6]:
d['blocked'].groupby('user_text')['pred_recipient_score']\
    .agg(attack='mean').hist(bins=30)
plt.xlabel('user level attack score')
plt.ylabel('num users')
plt.title('')
Out[6]:
[histogram: per-user mean attack score vs. num users, blocked users]
In [7]:
d['blocked'].query('not author_anon').groupby('user_text')['pred_recipient_score']\
    .agg(attack='mean').hist(bins=30)
plt.xlabel('user level attack score')
plt.ylabel('num users')
plt.title('')
Out[7]:
[histogram: per-user mean attack score vs. num users, blocked registered users]
In [8]:
d['2015'].query('not author_anon').groupby('user_text')['pred_recipient_score']\
    .agg(attack='mean')\
    .hist(bins=50)
plt.xlabel('user level attack score')
plt.ylabel('num users')
plt.title('')
Out[8]:
[histogram: per-user mean attack score vs. num users, registered users in the 2015 sample]
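Since the blocked and 2015 panels sit on separate figures, an overlaid, density-normalized view can make the comparison easier to read. A sketch; the normalization and bin count are choices made here, not part of the original analysis:
In [ ]:
# Overlay per-user mean attack score distributions for blocked vs. 2015 users.
for name, frame in (('blocked', d['blocked']), ('2015', d['2015'])):
    scores = frame.query('not author_anon')\
        .groupby('user_text')['pred_recipient_score'].mean()
    plt.hist(scores, bins=50, density=True, alpha=0.5, label=name)
plt.xlabel('user level attack score')
plt.ylabel('density of users')
plt.legend()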
In [9]:
# TODO
In [10]:
# TODO
In [11]:
# compare score distributions for anonymous vs. registered authors
o = (False, True)
x = 'author_anon'
compare_groups(d['sample'][:100000], x, order=o)
In [12]:
# exclude comments an author posts to their own page
o = (False, True)
x = 'recipient_anon'
compare_groups(d['sample'][:100000].query('not own_page'), x, order=o)
In [13]:
# compare comments posted to one's own page vs. someone else's
x = 'own_page'
o = (False, True)
compare_groups(d['sample'][:100000], x, order=o)
In [14]:
# same comparison, split by whether the author is anonymous
x = 'own_page'
compare_groups(d['sample'][:100000], x, order=o, hue='author_anon')
In [15]:
# annotate comments with author and recipient gender
d_gender = get_genders(d['sample'])
In [16]:
o = ('unknown: registered', 'male', 'female')
x = 'author_gender'
compare_groups(d_gender, x, order=o)
In [17]:
o = ('unknown: registered', 'male', 'female')
x = 'recipient_gender'
compare_groups(d_gender.query('not own_page'), x, order=o)
In [18]:
o = ('unknown: registered', 'male', 'female')
x = 'author_gender'
compare_groups(d_gender.query("not own_page and recipient_gender != 'unknown:anon'"), x, order=o, hue='recipient_gender')
In [19]:
# per-user comment-count thresholds at half-percentile steps, deduplicated after truncation to ints
thresholds = np.percentile(d['2015']['user_text'].value_counts(), np.arange(0, 100.01, 0.5))
thresholds = sorted(set(thresholds.astype(int)))
In [20]:
# build (label, range) activity bins between consecutive thresholds
bins = []
for i in range(len(thresholds) - 1):
    label = '%d-%d' % (thresholds[i], thresholds[i + 1] - 1)
    rnge = range(thresholds[i], thresholds[i + 1])
    bins.append((label, rnge))
In [21]:
def map_count(x):
    # map a comment count to its bin label (None if above the top threshold)
    for label, rnge in bins:
        if x in rnge:
            return label

d_temp = d['2015'].query('not author_anon')\
    .groupby('user_text')['pred_aggression_score']\
    .agg(aggressiveness='mean', count='size')\
    .assign(num_comment_range=lambda x: x['count'].apply(map_count))
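The same binning can be done in one vectorized step with pd.cut. A sketch, assuming the thresholds and bins from the cells above; right-open intervals match the range objects, so the single largest count maps to NaN just as map_count returns None for it, and the column name num_comment_range_cut is made up for illustration:
In [ ]:
# pd.cut with right=False reproduces the [lo, hi) (label, range) bins above.
labels = [label for label, _ in bins]
d_temp['num_comment_range_cut'] = pd.cut(
    d_temp['count'], bins=thresholds, labels=labels, right=False)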
In [22]:
o = [label for label, _ in bins]
sns.pointplot(x='num_comment_range', y='aggressiveness', data=d_temp, order=o)
Out[22]:
[point plot: mean aggressiveness by per-user comment-count range]
In [23]:
# TODO: extend to attacks, use long term user data, repeat for victims
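A minimal sketch of the first TODO item, reusing the activity bins above with the attack score; the name d_temp_attack is made up here, and the long-term user data and victim-side analyses remain open:
In [ ]:
# Same per-user binning as In [21]/[22], but for the attack score.
d_temp_attack = d['2015'].query('not author_anon')\
    .groupby('user_text')['pred_recipient_score']\
    .agg(attack='mean', count='size')\
    .assign(num_comment_range=lambda x: x['count'].apply(map_count))
sns.pointplot(x='num_comment_range', y='attack', data=d_temp_attack, order=o)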